library(tidyverse)  # data wrangling (dplyr, tidyr, readr) and ggplot2
library(h2o)        # ML backend used for the random forest model
library(DALEX)      # model-agnostic explainability toolkit
library(DALEXtra)   # DALEX adapters for h2o models (explain_h2o, lime support)
theme_set(theme_minimal())  # minimal ggplot2 theme for all plots in this document
We are going to assess credit risk using a dataset on credit applications made by German citizens. The cleaned version of this classic dataset is available on Kaggle and has a total of 1,000 instances with a range of features such as age, employment status, credit history, loan amount, and purpose of the loan (see more detail about the features here). By analyzing the patterns and trends in the dataset, financial institutions can make informed decisions about whether to approve or reject credit applications and manage their risk effectively.
# Load the cleaned German credit dataset (1000 rows, 11 columns incl. an unnamed ID column)
credit_data <- read_csv("../data/german_credit_risk/german_credit_data.csv")
## New names:
## Rows: 1000 Columns: 11
## ── Column specification
## ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── Delimiter: "," chr
## (6): Sex, Housing, Saving accounts, Checking account, Purpose, Risk dbl (5): ...1, Age, Job, Credit amount, Duration
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Quick overview of the raw data: types, missingness, and distributions
skimr::skim(credit_data)
| Name | credit_data |
| Number of rows | 1000 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| character | 6 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Sex | 0 | 1.00 | 4 | 6 | 0 | 2 | 0 |
| Housing | 0 | 1.00 | 3 | 4 | 0 | 3 | 0 |
| Saving accounts | 183 | 0.82 | 4 | 10 | 0 | 4 | 0 |
| Checking account | 394 | 0.61 | 4 | 8 | 0 | 3 | 0 |
| Purpose | 0 | 1.00 | 3 | 19 | 0 | 8 | 0 |
| Risk | 0 | 1.00 | 3 | 4 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| …1 | 0 | 1 | 499.50 | 288.82 | 0 | 249.8 | 499.5 | 749.2 | 999 | ▇▇▇▇▇ |
| Age | 0 | 1 | 35.55 | 11.38 | 19 | 27.0 | 33.0 | 42.0 | 75 | ▇▆▃▁▁ |
| Job | 0 | 1 | 1.90 | 0.65 | 0 | 2.0 | 2.0 | 2.0 | 3 | ▁▂▁▇▂ |
| Credit amount | 0 | 1 | 3271.26 | 2822.74 | 250 | 1365.5 | 2319.5 | 3972.2 | 18424 | ▇▂▁▁▁ |
| Duration | 0 | 1 | 20.90 | 12.06 | 4 | 12.0 | 18.0 | 24.0 | 72 | ▇▇▂▁▁ |
# Clean the raw data:
#   - drop the unnamed row-ID column,
#   - normalize column names to snake_case,
#   - encode the target as a 0/1 factor (1 = "bad" credit risk),
#   - decode the numeric job codes into readable labels,
#   - make missing account levels an explicit "missing" category,
#   - convert all remaining character columns to factors (needed by h2o).
credit_data <- credit_data |>
  select(-1) |>
  rename_with(\(nm) tolower(gsub(" ", "_", nm, fixed = TRUE))) |>
  mutate(
    risk = as.factor(if_else(risk == "bad", 1, 0)),
    job = case_when(
      job == 0 ~ "unskilled and non-resident",
      job == 1 ~ "unskilled and resident",
      job == 2 ~ "skilled",
      job == 3 ~ "highly skilled"
    ),
    across(
      c(saving_accounts, checking_account),
      \(col) replace_na(col, "missing")
    ),
    across(where(is.character), as.factor)
  )
# Re-inspect the cleaned data
skimr::skim(credit_data)
| Name | credit_data |
| Number of rows | 1000 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| factor | 7 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| sex | 0 | 1 | FALSE | 2 | mal: 690, fem: 310 |
| job | 0 | 1 | FALSE | 4 | ski: 630, uns: 200, hig: 148, uns: 22 |
| housing | 0 | 1 | FALSE | 3 | own: 713, ren: 179, fre: 108 |
| saving_accounts | 0 | 1 | FALSE | 5 | lit: 603, mis: 183, mod: 103, qui: 63 |
| checking_account | 0 | 1 | FALSE | 4 | mis: 394, lit: 274, mod: 269, ric: 63 |
| purpose | 0 | 1 | FALSE | 8 | car: 337, rad: 280, fur: 181, bus: 97 |
| risk | 0 | 1 | FALSE | 2 | 0: 700, 1: 300 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 0 | 1 | 35.55 | 11.38 | 19 | 27 | 33 | 42 | 75 | ▇▆▃▁▁ |
| credit_amount | 0 | 1 | 3271.26 | 2822.74 | 250 | 1366 | 2320 | 3972 | 18424 | ▇▂▁▁▁ |
| duration | 0 | 1 | 20.90 | 12.06 | 4 | 12 | 18 | 24 | 72 | ▇▇▂▁▁ |
# Start (or connect to an already running) local H2O cluster
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 41 minutes 5 seconds
## H2O cluster timezone: Europe/Budapest
## H2O data parsing timezone: UTC
## H2O cluster version: 3.40.0.1
## H2O cluster version age: 2 months and 4 days
## H2O cluster name: H2O_started_from_R_i525503_njy323
## H2O cluster total nodes: 1
## H2O cluster total memory: 6.80 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.2.2 (2022-10-31)
# Fixed seed for reproducible splitting and model training
my_seed <- 20230329
# ~75/25 train/holdout split; h2o.splitFrame splits approximately
# (the holdout below ends up with 258 rows rather than exactly 250)
data_split <- h2o.splitFrame(as.h2o(credit_data), ratios = 0.75, seed = my_seed)
##
|
| | 0%
|
|==============================================================================================================================================================================================| 100%
credit_train <- data_split[[1]]    # ~75% of rows: used for training + CV
credit_holdout <- data_split[[2]]  # ~25% of rows: kept aside for diagnostics
target <- "risk"                                 # binary outcome (1 = bad risk)
features <- setdiff(names(credit_data), target)  # all other columns are predictors
# Train the best classification model you can come up with (optimize for AUC)
# Baseline here: a random forest with default hyperparameters. 5-fold
# cross-validation gives an out-of-sample AUC estimate without touching
# the holdout set.
my_model <- h2o.randomForest(
  x = features, y = target,
  training_frame = credit_train,
  nfolds = 5,
  seed = my_seed
)
##
|
| | 0%
|
|============================================================================================================================================================== | 83%
|
|==============================================================================================================================================================================================| 100%
# Cross-validated metrics (AUC, confusion matrix, threshold-dependent metrics)
h2o.performance(my_model, xval = TRUE)
## H2OBinomialMetrics: drf
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.185
## RMSE: 0.4301
## LogLoss: 0.5917
## Mean Per-Class Error: 0.3386
## AUC: 0.7172
## AUCPR: 0.5003
## Gini: 0.4343
## R^2: 0.09847
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 279 249 0.471591 =249/528
## 1 44 170 0.205607 =44/214
## Totals 323 419 0.394879 =293/742
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.235000 0.537125 161
## 2 max f2 0.046667 0.695452 222
## 3 max f0point5 0.470000 0.515184 74
## 4 max accuracy 0.545714 0.733154 55
## 5 max precision 0.960000 1.000000 0
## 6 max recall 0.000000 1.000000 231
## 7 max specificity 0.960000 1.000000 0
## 8 max absolute_mcc 0.470000 0.306791 74
## 9 max min_per_class_accuracy 0.306667 0.655303 131
## 10 max mean_per_class_accuracy 0.283333 0.664357 141
## 11 max tns 0.960000 528.000000 0
## 12 max fns 0.960000 212.000000 0
## 13 max fps 0.000000 528.000000 231
## 14 max tps 0.000000 214.000000 231
## 15 max tnr 0.960000 1.000000 0
## 16 max fnr 0.960000 0.990654 0
## 17 max fpr 0.000000 1.000000 231
## 18 max tpr 0.000000 1.000000 231
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Cross-validated confusion matrix (reported at the F1-optimal threshold)
h2o.confusionMatrix(my_model, xval = TRUE)
# Model-based variable importance, as a table and as a plot
h2o.varimp(my_model)
h2o.varimp_plot(my_model)
# h2o's one-stop explainability report: residual analysis, variable importance,
# SHAP summary, PDPs -- useful, but hard to customize (hence DALEX below).
# Evaluated on the holdout frame rather than the training data.
h2o.explain(my_model, credit_holdout)
##
##
## Confusion Matrix
## ================
##
## > Confusion matrix shows a predicted class vs an actual class.
##
##
##
## DRF_model_R_1681335958994_1159
## ------------------------------
##
## | | 0 | 1 | Error | Rate
## |:---:|:---:|:---:|:---:|:---:|
## | **0** |105 | 67 | 0.38953488372093 | =67/172 |
## | **1** |16 | 70 | 0.186046511627907 | =16/86 |
## | **Totals** |121 | 137 | 0.321705426356589 | =83/258 |
##
##
## Learning Curve Plot
## ===================
##
## > Learning curve plot shows the loss function/metric dependent on number of iterations or trees for tree-based algorithms. This plot can be useful for determining whether the model overfits.
##
##
## Variable Importance
## ===================
##
## > The variable importance plot shows the relative importance of the most important variables in the model.
##
##
## SHAP Summary
## ============
##
## > SHAP summary plot shows the contribution of the features for each instance (row of data). The sum of the feature contributions and the bias term is equal to the raw prediction of the model, i.e., prediction before applying inverse link function.
##
##
## Partial Dependence Plots
## ========================
##
## > Partial dependence plot (PDP) gives a graphical depiction of the marginal effect of a variable on the response. The effect of a variable is measured in change in the mean response. PDP assumes independence between the feature for which is the PDP computed and the rest.
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## ℹ The deprecated feature was likely used in the h2o package.
## Please report the issue at <]8;;https://h2oai.atlassian.net/projects/PUBDEVhttps://h2oai.atlassian.net/projects/PUBDEV]8;;>.
## Diagnostics with Dalex
# Wrap the h2o model in a DALEX explainer so model-agnostic diagnostics
# (permutation importance, PDP/ALE, LIME, SHAP) can be run on the holdout set.
# NOTE(review): y is built with as.numeric() on the factor-typed target column
# of an H2OFrame. The residual summary printed below (range ~[-0.95, 1]) is
# consistent with a 0/1 coding here, but confirm this does not fall into the
# base-R as.numeric(factor) 1/2-coding trap if the data pipeline changes.
dalex_explainer <- explain_h2o(my_model, data = credit_holdout[features], y = as.numeric(credit_holdout[[target]]))
## Preparation of a new explainer is initiated
## -> model label : H2OBinomialModel ( default )
## -> data : 258 rows 9 cols
## -> target variable : 258 values
## -> predict function : yhat.H2OBinomialModel will be used ( default )
## -> predicted values : No value for predict function target column. ( default )
##
|
| | 0%
|
|==============================================================================================================================================================================================| 100%
##
|
| | 0%
|
|==============================================================================================================================================================================================| 100%
## -> model_info : package h2o , ver. 3.40.0.1 , task classification ( default )
##
|
| | 0%
|
|==============================================================================================================================================================================================| 100%
##
|
| | 0%
|
|==============================================================================================================================================================================================| 100%
## -> predicted values : numerical, min = 0 , mean = 0.32 , max = 0.95
## -> residual function : difference between y and yhat ( default )
##
|
| | 0%
|
|==============================================================================================================================================================================================| 100%
##
|
| | 0%
|
|==============================================================================================================================================================================================| 100%
## -> residuals : numerical, min = -0.95 , mean = 0.0133 , max = 1
## A new explainer has been created!
class(dalex_explainer)
## [1] "explainer"
summary(dalex_explainer)
## Length Class Mode
## model 1 H2OBinomialModel S4
## data 9 data.frame list
## y 258 -none- numeric
## predict_function 1 -none- function
## y_hat 258 -none- numeric
## residuals 258 -none- numeric
## class 1 -none- character
## label 1 -none- character
## model_info 3 model_info list
## residual_function 1 -none- function
## weights 0 -none- NULL
Dalex calculates permutation-based feature importance
(model-agnostic). The parameter B stands for the number of
permutations. It shows how much the model’s performance would change if
we did not include the given variable.
# Permutation-based (model-agnostic) variable importance on the holdout data.
# B = 20 permutations per variable: higher B smooths the estimate but is slower.
dalex_vip <- model_parts(dalex_explainer, B = 20) # takes a while
plot(dalex_vip)
### Partial Dependence Plots
Partial Dependence Plots express how the change of a feature influences the prediction.
PDP is sensitive to correlated features. Accumulated Local Effect expresses the similar idea but in a more robust way. See chapters 17-18 of Biecek-Burzykowski for more detail.
# PDP for age: marginal effect of age on the predicted risk, averaged over
# the holdout observations
pdp_age <- model_profile(dalex_explainer, variables = "age", type = "partial") # default: partial
plot(pdp_age, geom = "profiles")
# Accumulated Local Effects: similar idea, but more robust to correlated features
pdp_age_accumulated <- model_profile(dalex_explainer, variables = "age", type = "accumulated")
plot(pdp_age_accumulated, geom = "profiles")
# PDPs for all numeric features at once
pdp_numeric <- model_profile(dalex_explainer, variable_type = "numerical")
## Warning in FUN(X[[i]], ...): Variable: < credit_amount > has more than 201 unique values and all of them will be used as variable splits in calculating variable profiles. Use the `variable_splits`
## parameter to mannualy change this behaviour. If you believe this warning to be a false positive, raise issue at <https://github.com/ModelOriented/ingredients/issues>.
plot(pdp_numeric)                  # aggregated profiles only
plot(pdp_numeric, geom = "points") # also show individual observations
# Profiles for the factor (categorical) features
plot(model_profile(dalex_explainer, variable_type = "categorical"))
## Warning in FUN(X[[i]], ...): Variable: < credit_amount > has more than 201 unique values and all of them will be used as variable splits in calculating variable profiles. Use the `variable_splits`
## parameter to mannualy change this behaviour. If you believe this warning to be a false positive, raise issue at <https://github.com/ModelOriented/ingredients/issues>.
Local explanations: why does a given instance get its prediction, and how could it be changed?
# Pick a single holdout observation to explain locally
# (convert to tibble first: local explainers need a plain data frame)
obs_of_interest <- as_tibble(credit_holdout)[5, ]
obs_of_interest
# Model prediction for this observation: class label plus class probabilities
h2o.predict(my_model, as.h2o(obs_of_interest[, features]))
##
|
| | 0%
|
|==============================================================================================================================================================================================| 100%
##
|
| | 0%
|
|==============================================================================================================================================================================================| 100%
## predict p0 p1
## 1 1 0.12 0.88
##
## [1 row x 3 columns]
# Make the DALEXtra S3 methods visible in the global environment so that
# predict_surrogate()'s lime backend can find them via S3 dispatch
# (presumably required by the lime package — see DALEXtra::predict_surrogate docs)
model_type.dalex_explainer <- DALEXtra::model_type.dalex_explainer
predict_model.dalex_explainer <- DALEXtra::predict_model.dalex_explainer
# LIME: fit a sparse local surrogate model around the chosen observation
lime_explanation <- predict_surrogate(
  explainer = dalex_explainer,
  new_observation = obs_of_interest[, features], # needs to use a normal df not an H2OFrame!
  type = "lime",
  n_features = 10, # max number of features in the surrogate (default: 4)
  seed = my_seed # samples for permutations - still not reproducible :(
)
plot(lime_explanation)
Pro: model-agnostic and fast; the sparse surrogate model is easy to interpret.
Con: depends on random sampling around the instance (hard to reproduce, as seen above) and the explanation is only locally valid.
# Shapley is most suitable for models with a small or moderate number of explanatory variables
# SHAP: average each feature's contribution to this prediction over B random
# orderings of the features
shapley <- predict_parts(
  explainer = dalex_explainer,
  new_observation = obs_of_interest[, features],
  type = "shap",
  B = 20 # number of random orderings to aggregate (default: 25)
)
plot(shapley)
Pro: grounded in game theory; feature contributions sum to the difference between this prediction and the average prediction.
Con: computationally expensive, so only sampled orderings (B) are used, which makes the attributions approximate.